From 88b71fd5cc7b407dd7d9a8ad92f940cb393b0a1b Mon Sep 17 00:00:00 2001 From: "iap10@tetris.cl.cam.ac.uk" Date: Thu, 25 Mar 2004 01:50:35 +0000 Subject: [PATCH] bitkeeper revision 1.794.1.4 (40623aebq_XP4MvV6YJsXGleofDYNg) shadow mode improvements : use hash table to avoid increasing pfn_info size. improved locking in preparation for SMP guests. --- xen/arch/i386/process.c | 17 +- xen/arch/i386/traps.c | 4 +- xen/common/debug.c | 6 +- xen/common/domain.c | 27 +-- xen/common/memory.c | 103 +++++----- xen/common/perfc.c | 2 +- xen/common/shadow.c | 338 +++++++++++++++++++------------ xen/include/asm-i386/config.h | 2 - xen/include/asm-i386/processor.h | 13 +- xen/include/xeno/mm.h | 4 - xen/include/xeno/shadow.h | 313 +++++++++++++++++++++++++++- xen/net/dev.c | 39 ++-- 12 files changed, 607 insertions(+), 261 deletions(-) diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c index 2c53801089..f4b2ef4006 100644 --- a/xen/arch/i386/process.c +++ b/xen/arch/i386/process.c @@ -282,25 +282,14 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Switch page tables. */ -#ifdef CONFIG_SHADOW - - /* printk("switch_to %08lx, %08lx\n", next_p->mm.pagetable, - next_p->mm.shadowtable);*/ - - - if( next_p->mm.shadowmode ) + if( next_p->mm.shadow_mode ) { - check_pagetable( next_p->mm.pagetable, "switch" ); - write_cr3_counted(pagetable_val(next_p->mm.shadowtable)); + check_pagetable( next_p, next_p->mm.pagetable, "switch" ); + write_cr3_counted(pagetable_val(next_p->mm.shadow_table)); } else -#endif write_cr3_counted(pagetable_val(next_p->mm.pagetable)); - - - - set_current(next_p); /* Switch GDT and LDT. */ diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c index eed1fb9faf..f35e0c898d 100644 --- a/xen/arch/i386/traps.c +++ b/xen/arch/i386/traps.c @@ -339,13 +339,11 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code) return; /* successfully copied the mapping */ } -#ifdef CONFIG_SHADOW - if ( p->mm.shadowmode && addr < PAGE_OFFSET && + if ( unlikely( p->mm.shadow_mode ) && addr < PAGE_OFFSET && shadow_fault( addr, error_code ) ) { return; // return true if fault was handled } -#endif if ( unlikely(!(regs->xcs & 3)) ) goto fault_in_hypervisor; diff --git a/xen/common/debug.c b/xen/common/debug.c index 4e298bbfb5..2956b0ea8c 100644 --- a/xen/common/debug.c +++ b/xen/common/debug.c @@ -91,11 +91,9 @@ int pdb_change_values(domid_t domain, u_char *buffer, unsigned long addr, if ((addr >> PAGE_SHIFT) == ((addr + length - 1) >> PAGE_SHIFT)) { -#ifdef CONFIG_SHADOW - if (p->mm.shadowmode ) - l2_table = map_domain_mem(pagetable_val(p->mm.shadowtable)); + if (p->mm.shadow_mode ) + l2_table = map_domain_mem(pagetable_val(p->mm.shadow_table)); else -#endif l2_table = map_domain_mem(pagetable_val(p->mm.pagetable)); l2_table += l2_table_offset(addr); diff --git a/xen/common/domain.c b/xen/common/domain.c index c63c9164e3..360677458d 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -341,12 +341,14 @@ void free_domain_page(struct pfn_info *page) if ( !(page->count_and_flags & PGC_zombie) ) { page->tlbflush_timestamp = tlbflush_clock; - page->u.cpu_mask = 1 << p->processor; - - spin_lock(&p->page_list_lock); - list_del(&page->list); - p->tot_pages--; - spin_unlock(&p->page_list_lock); + if (p) + { + page->u.cpu_mask = 1 << p->processor; + spin_lock(&p->page_list_lock); + list_del(&page->list); + p->tot_pages--; + spin_unlock(&p->page_list_lock); + } } page->count_and_flags = 0; @@ -547,10 +549,6 @@ int final_setup_guestos(struct task_struct *p, 
dom0_builddomain_t *builddomain) get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, PGT_l2_page_table); -#ifdef CONFIG_SHADOW - p->mm.shadowtable = shadow_mk_pagetable(phys_l2tab, p->mm.shadowmode); -#endif - /* Set up the shared info structure. */ update_dom_time(p->shared_info); @@ -852,15 +850,10 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, set_bit(PF_CONSTRUCTED, &p->flags); -#ifdef CONFIG_SHADOW - -printk("Engage shadow mode for dom 0\n"); - p->mm.shadowmode = SHM_test; // XXXXX IAP - p->mm.shadowtable = shadow_mk_pagetable(phys_l2tab, p->mm.shadowmode ); +#if 1 // XXXXX IAP DO NOT CHECK IN ENBALED !!!!!!! + shadow_mode_enable(p, SHM_test); #endif - - new_thread(p, (unsigned long)virt_load_address, (unsigned long)virt_stack_address, diff --git a/xen/common/memory.c b/xen/common/memory.c index f6e8155f71..1c54713006 100644 --- a/xen/common/memory.c +++ b/xen/common/memory.c @@ -765,20 +765,22 @@ void free_page_type(struct pfn_info *page, unsigned int type) { case PGT_l1_page_table: free_l1_table(page); -#ifdef CONFIG_SHADOW - // assume we're in shadow mode if PSH_shadowed set - if ( current->mm.shadowmode && page->shadow_and_flags & PSH_shadowed ) + if ( unlikely(current->mm.shadow_mode) && + (get_shadow_status(current, page-frame_table) & PSH_shadowed) ) + { unshadow_table( page-frame_table, type ); -#endif + put_shadow_status(current); + } return; case PGT_l2_page_table: free_l2_table(page); -#ifdef CONFIG_SHADOW - // assume we're in shadow mode if PSH_shadowed set - if ( current->mm.shadowmode && page->shadow_and_flags & PSH_shadowed ) + if ( unlikely(current->mm.shadow_mode) && + (get_shadow_status(current, page-frame_table) & PSH_shadowed) ) + { unshadow_table( page-frame_table, type ); -#endif + put_shadow_status(current); + } return; default: @@ -848,21 +850,22 @@ static int do_extended_command(unsigned long ptr, unsigned long val) put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable) >> PAGE_SHIFT]); current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT); -#ifdef CONFIG_SHADOW - current->mm.shadowtable = - shadow_mk_pagetable(pfn << PAGE_SHIFT, current->mm.shadowmode); -#endif - invalidate_shadow_ldt(); + if( unlikely(current->mm.shadow_mode)) + current->mm.shadow_table = + shadow_mk_pagetable(current, pfn<mm.shadowmode) ) + if ( unlikely(current->mm.shadow_mode) ) { - check_pagetable( current->mm.pagetable, "pre-stlb-flush" ); - write_cr3_counted(pagetable_val(current->mm.shadowtable)); + check_pagetable( current, + current->mm.pagetable, "pre-stlb-flush" ); + write_cr3_counted(pagetable_val(current->mm.shadow_table)); } else -#endif write_cr3_counted(pagetable_val(current->mm.pagetable)); } else @@ -947,10 +950,8 @@ int do_mmu_update(mmu_update_t *ureqs, int count) struct pfn_info *page; int rc = 0, okay = 1, i, cpu = smp_processor_id(); unsigned int cmd; -#ifdef CONFIG_SHADOW unsigned long prev_spfn = 0; l1_pgentry_t *prev_spl1e = 0; -#endif perfc_incrc(calls_to_mmu_update); perfc_addc(num_page_updates, count); @@ -1002,11 +1003,14 @@ int do_mmu_update(mmu_update_t *ureqs, int count) okay = mod_l1_entry((l1_pgentry_t *)va, mk_l1_pgentry(req.val)); -#ifdef CONFIG_SHADOW - if ( okay && page->shadow_and_flags & PSH_shadowed ) + if ( okay && unlikely(current->mm.shadow_mode) && + (get_shadow_status(current, page-frame_table) & + PSH_shadowed) ) + { shadow_l1_normal_pt_update( req.ptr, req.val, &prev_spfn, &prev_spl1e ); -#endif + put_shadow_status(current); + } put_page_type(page); } @@ -1017,10 +1021,14 @@ int 
do_mmu_update(mmu_update_t *ureqs, int count) okay = mod_l2_entry((l2_pgentry_t *)va, mk_l2_pgentry(req.val), pfn); -#ifdef CONFIG_SHADOW - if ( okay && page->shadow_and_flags & PSH_shadowed ) + + if ( okay && unlikely(current->mm.shadow_mode) && + (get_shadow_status(current, page-frame_table) & + PSH_shadowed) ) + { shadow_l2_normal_pt_update( req.ptr, req.val ); -#endif + put_shadow_status(current); + } put_page_type(page); } @@ -1032,19 +1040,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count) okay = 1; put_page_type(page); -#ifdef CONFIG_SHADOW - if ( page->shadow_and_flags & PSH_shadowed ) - BUG(); - // at present, we shouldn't be shadowing such pages -#endif - - + // at present, we don't shadowing such pages } break; } -check_pagetable( current->mm.pagetable, "mmu" ); // XXX XXX XXX XXX XXX - put_page(page); break; @@ -1087,25 +1087,22 @@ check_pagetable( current->mm.pagetable, "mmu" ); // XXX XXX XXX XXX XXX if ( prev_pfn != 0 ) unmap_domain_mem((void *)va); -#ifdef CONFIG_SHADOW if( prev_spl1e != 0 ) unmap_domain_mem((void *)prev_spl1e); -#endif deferred_ops = percpu_info[cpu].deferred_ops; percpu_info[cpu].deferred_ops = 0; if ( deferred_ops & DOP_FLUSH_TLB ) { -#ifdef CONFIG_SHADOW - if ( unlikely(current->mm.shadowmode) ) + if ( unlikely(current->mm.shadow_mode) ) { - check_pagetable( current->mm.pagetable, "pre-stlb-flush" ); - write_cr3_counted(pagetable_val(current->mm.shadowtable)); + check_pagetable( current, + current->mm.pagetable, "pre-stlb-flush" ); + write_cr3_counted(pagetable_val(current->mm.shadow_table)); } else -#endif - write_cr3_counted(pagetable_val(current->mm.pagetable)); + write_cr3_counted(pagetable_val(current->mm.pagetable)); } if ( deferred_ops & DOP_RELOAD_LDT ) @@ -1142,9 +1139,7 @@ int do_update_va_mapping(unsigned long page_nr, mk_l1_pgentry(val))) ) err = -EINVAL; -#ifdef CONFIG_SHADOW - - if ( unlikely(p->mm.shadowmode) ) + if ( unlikely(p->mm.shadow_mode) ) { unsigned long sval = 0; @@ -1164,14 +1159,14 @@ int do_update_va_mapping(unsigned long page_nr, { // Since L2's are guranteed RW, failure indicates the page // was not shadowed, so ignore. 
- + perfc_incrc(shadow_update_va_fail); //MEM_LOG("update_va_map: couldn't write update\n"); } - } -check_pagetable( p->mm.pagetable, "va" ); + check_pagetable( p, p->mm.pagetable, "va" ); // debug + + } -#endif deferred_ops = percpu_info[cpu].deferred_ops; percpu_info[cpu].deferred_ops = 0; @@ -1179,12 +1174,10 @@ check_pagetable( p->mm.pagetable, "va" ); if ( unlikely(deferred_ops & DOP_FLUSH_TLB) || unlikely(flags & UVMF_FLUSH_TLB) ) { -#ifdef CONFIG_SHADOW - if ( unlikely(p->mm.shadowmode) ) - write_cr3_counted(pagetable_val(p->mm.shadowtable)); + if ( unlikely(p->mm.shadow_mode) ) + write_cr3_counted(pagetable_val(p->mm.shadow_table)); else -#endif - write_cr3_counted(pagetable_val(p->mm.pagetable)); + write_cr3_counted(pagetable_val(p->mm.pagetable)); } else if ( unlikely(flags & UVMF_INVLPG) ) __flush_tlb_one(page_nr << PAGE_SHIFT); diff --git a/xen/common/perfc.c b/xen/common/perfc.c index af9abbb67c..1d24c8fd04 100644 --- a/xen/common/perfc.c +++ b/xen/common/perfc.c @@ -103,7 +103,7 @@ void perfc_reset(u_char key, void *dev_id, struct pt_regs *regs) for ( j = sum = 0; j < perfc_info[i].nr_elements; j++ ) atomic_set(&counters[j],0); case TYPE_S_ARRAY: - counters += j; + counters += perfc_info[i].nr_elements; break; } } diff --git a/xen/common/shadow.c b/xen/common/shadow.c index a0df57d8f1..c1e25f5a52 100644 --- a/xen/common/shadow.c +++ b/xen/common/shadow.c @@ -7,70 +7,161 @@ #include #include -#ifdef CONFIG_SHADOW - - -#if SHADOW_DEBUG -#define MEM_VLOG(_f, _a...) \ - printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ - current->domain , __LINE__ , ## _a ) -#else -#define MEM_VLOG(_f, _a...) -#endif - -#if 0 -#define MEM_VVLOG(_f, _a...) \ - printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ - current->domain , __LINE__ , ## _a ) -#else -#define MEM_VVLOG(_f, _a...) -#endif - /******** To use these shadow page tables, guests must not rely on the ACCESSED and DIRTY bits on L2 pte's being accurate -- they will typically all be set. + I doubt this will break anything. (If guests want to use the va_update mechanism they've signed up for this anyhow...) ********/ -pagetable_t shadow_mk_pagetable( unsigned long gptbase, - unsigned int shadowmode ) +int shadow_mode_enable( struct task_struct *p, unsigned int mode ) { - unsigned long gpfn, spfn=0; + struct shadow_status **fptr; + int i; + + // sychronously stop domain + // XXX for the moment, only use on already stopped domains!!! + + spin_lock_init(&p->mm.shadow_lock); + spin_lock(&p->mm.shadow_lock); + + p->mm.shadow_mode = mode; + + // allocate hashtable + p->mm.shadow_ht = kmalloc( shadow_ht_buckets * + sizeof(struct shadow_status), GFP_KERNEL ); + if( ! p->mm.shadow_ht ) + goto nomem; - MEM_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", - gptbase, shadowmode ); + memset( p->mm.shadow_ht, 0, shadow_ht_buckets * + sizeof(struct shadow_status) ); - if ( unlikely(shadowmode) ) + + // allocate space for first lot of extra nodes + p->mm.shadow_ht_extras = kmalloc( sizeof(void*) + (shadow_ht_extra_size * + sizeof(struct shadow_status)), GFP_KERNEL ); + + if( ! 
p->mm.shadow_ht_extras ) + goto nomem; + + memset( p->mm.shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size * + sizeof(struct shadow_status)) ); + + // add extras to free list + fptr = &p->mm.shadow_ht_free; + for ( i=0; i> PAGE_SHIFT; - - if ( likely(frame_table[gpfn].shadow_and_flags & PSH_shadowed) ) - { - spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + *fptr = &p->mm.shadow_ht_extras[i]; + fptr = &(p->mm.shadow_ht_extras[i].next); + } + *fptr = NULL; + *((struct shadow_status ** ) &p->mm.shadow_ht_extras[shadow_ht_extra_size]) = NULL; + + spin_unlock(&p->mm.shadow_lock); + + // call shadow_mk_pagetable + p->mm.shadow_table = shadow_mk_pagetable( p, + pagetable_val(p->mm.pagetable) ); + + return 0; + +nomem: + spin_unlock(&p->mm.shadow_lock); + return -ENOMEM; +} + +void shadow_mode_disable( ) +{ + + // free the hash buckets as you go + + // free the hashtable itself +} + + +static inline void free_shadow_page( struct task_struct *p, unsigned int pfn ) +{ + unsigned long flags; + + p->mm.shadow_page_count--; + + spin_lock_irqsave(&free_list_lock, flags); + list_add(&frame_table[pfn].list, &free_list); + free_pfns++; + spin_unlock_irqrestore(&free_list_lock, flags); +} + +static inline struct pfn_info *alloc_shadow_page( struct task_struct *p ) +{ + p->mm.shadow_page_count++; + + return alloc_domain_page( NULL ); +} + + +static void __free_shadow_table( struct task_struct *p ) +{ + int j; + struct shadow_status *a; + + // the code assumes you're not using the page tables i.e. + // the domain is stopped and cr3 is something else!! + + // walk the hash table and call free_shadow_page on all pages + + for(j=0;jmm.shadow_ht[j]; + if (a->pfn) + { + free_shadow_page( p, a->spfn_and_flags & PSH_pfn_mask ); + a->pfn = 0; + a->spfn_and_flags = 0; + } + a=a->next; + while(a) + { + struct shadow_status *next = a->next; + free_shadow_page( p, a->spfn_and_flags & PSH_pfn_mask ); + a->pfn = 0; + a->spfn_and_flags = 0; + a->next = p->mm.shadow_ht_free; + p->mm.shadow_ht_free = a; + a=next; } - else - { - spfn = shadow_l2_table( gpfn ); - } } +} + +static void flush_shadow_table( struct task_struct *p ) +{ + + // XXX synchronously stop domain (needed for SMP guests) + + // switch to idle task's page tables + + // walk the hash table and call free_shadow_page on all pages + spin_lock(&p->mm.shadow_lock); + __free_shadow_table( p ); + spin_unlock(&p->mm.shadow_lock); - return mk_pagetable(spfn << PAGE_SHIFT); + // XXX unpause domain } + + void unshadow_table( unsigned long gpfn, unsigned int type ) { unsigned long spfn; - MEM_VLOG("unshadow_table type=%08x gpfn=%08lx, spfn=%08lx", + SH_VLOG("unshadow_table type=%08x gpfn=%08lx", type, - gpfn, - frame_table[gpfn].shadow_and_flags & PSH_pfn_mask ); + gpfn ); perfc_incrc(unshadow_table_count); @@ -79,9 +170,8 @@ void unshadow_table( unsigned long gpfn, unsigned int type ) // even in the SMP guest case, there won't be a race here as // this CPU was the one that cmpxchg'ed the page to invalid - spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; - frame_table[gpfn].shadow_and_flags=0; - frame_table[spfn].shadow_and_flags=0; + spfn = __shadow_status(current, gpfn) & PSH_pfn_mask; + delete_shadow_status(current, gpfn); #if 0 // XXX leave as might be useful for later debugging { @@ -101,27 +191,21 @@ void unshadow_table( unsigned long gpfn, unsigned int type ) else perfc_decr(shadow_l2_pages); - //free_domain_page( &frame_table[spfn] ); - - { - unsigned long flags; - spin_lock_irqsave(&free_list_lock, flags); - 
list_add(&frame_table[spfn].list, &free_list); - free_pfns++; - spin_unlock_irqrestore(&free_list_lock, flags); - } + free_shadow_page( current, spfn ); } -unsigned long shadow_l2_table( unsigned long gpfn ) +static unsigned long shadow_l2_table( + struct task_struct *p, unsigned long gpfn ) { struct pfn_info *spfn_info; unsigned long spfn; l2_pgentry_t *spl2e, *gpl2e; int i; - MEM_VVLOG("shadow_l2_table( %08lx )",gpfn); + SH_VVLOG("shadow_l2_table( %08lx )",gpfn); + spin_lock(&p->mm.shadow_lock); perfc_incrc(shadow_l2_table_count); perfc_incr(shadow_l2_pages); @@ -129,17 +213,14 @@ unsigned long shadow_l2_table( unsigned long gpfn ) // XXX in future, worry about racing in SMP guests // -- use cmpxchg with PSH_pending flag to show progress (and spin) - spfn_info = alloc_domain_page( NULL ); // XXX account properly later + spfn_info = alloc_shadow_page(p); ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache spfn = (unsigned long) (spfn_info - frame_table); // mark pfn as being shadowed, update field to point at shadow - frame_table[gpfn].shadow_and_flags = spfn | PSH_shadowed; - - // mark shadow pfn as being a shadow, update field to point at pfn - frame_table[spfn].shadow_and_flags = gpfn | PSH_shadow; + set_shadow_status(p, gpfn, spfn | PSH_shadowed); // we need to do this before the linear map is set up spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT); @@ -172,11 +253,11 @@ unsigned long shadow_l2_table( unsigned long gpfn ) if (gpte & _PAGE_PRESENT) { unsigned long s_sh = - frame_table[ gpte>>PAGE_SHIFT ].shadow_and_flags; + __shadow_status(p, gpte>>PAGE_SHIFT); if( s_sh & PSH_shadowed ) // PSH_shadowed { - if ( unlikely( (frame_table[gpte>>PAGE_SHIFT].type_and_flags & PGT_type_mask) == PGT_l2_page_table) ) + if ( unlikely( (__shadow_status(p, gpte>>PAGE_SHIFT) & PGT_type_mask) == PGT_l2_page_table) ) { printk("Linear mapping detected\n"); spte = gpte & ~_PAGE_RW; @@ -203,33 +284,61 @@ unsigned long shadow_l2_table( unsigned long gpfn ) unmap_domain_mem( gpl2e ); unmap_domain_mem( spl2e ); - MEM_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn); - + SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn); + spin_unlock(&p->mm.shadow_lock); return spfn; } +pagetable_t shadow_mk_pagetable( struct task_struct *p, + unsigned long gptbase) +{ + unsigned long gpfn, spfn=0; + + SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )", + gptbase, p->mm.shadow_mode ); + + if ( likely(p->mm.shadow_mode) ) // should always be true if we're here + { + gpfn = gptbase >> PAGE_SHIFT; + + if ( unlikely((spfn=__shadow_status(p, gpfn)) == 0 ) ) + { + spfn = shadow_l2_table(p, gpfn ); + } + } + + SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d )", + gptbase, p->mm.shadow_mode ); + + return mk_pagetable(spfn<mm.pagetable, "pre-sf" ); + spin_lock(¤t->mm.shadow_lock); + + check_pagetable( current, current->mm.pagetable, "pre-sf" ); if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) ) { - MEM_VVLOG("shadow_fault - EXIT: read gpte faulted" ); + SH_VVLOG("shadow_fault - EXIT: read gpte faulted" ); + spin_unlock(¤t->mm.shadow_lock); return 0; // propagate to guest } if ( ! 
(gpte & _PAGE_PRESENT) ) { - MEM_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); + SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte ); + spin_unlock(¤t->mm.shadow_lock); return 0; // we're not going to be able to help } + spte = gpte; if ( error_code & 2 ) @@ -242,7 +351,8 @@ int shadow_fault( unsigned long va, long error_code ) } else { // write fault on RO page - MEM_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte ); + SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte ); + spin_unlock(¤t->mm.shadow_lock); return 0; // propagate to guest // not clear whether we should set accessed bit here... } @@ -255,7 +365,7 @@ int shadow_fault( unsigned long va, long error_code ) spte &= ~_PAGE_RW; // force clear unless already dirty } - MEM_VVLOG("plan: gpte=%08lx spte=%08lx", gpte, spte ); + SH_VVLOG("plan: gpte=%08lx spte=%08lx", gpte, spte ); // write back updated gpte // XXX watch out for read-only L2 entries! (not used in Linux) @@ -269,13 +379,13 @@ int shadow_fault( unsigned long va, long error_code ) unsigned long gpde, spde, gl1pfn, sl1pfn; - MEM_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx spte=%08lx",gpte,spte ); + SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx spte=%08lx",gpte,spte ); gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]); gl1pfn = gpde>>PAGE_SHIFT; - if ( ! (frame_table[gl1pfn].shadow_and_flags & PSH_shadowed ) ) + if ( ! (sl1pfn=__shadow_status(current, gl1pfn) ) ) { // this L1 is NOT already shadowed so we need to shadow it struct pfn_info *sl1pfn_info; @@ -284,12 +394,11 @@ int shadow_fault( unsigned long va, long error_code ) sl1pfn_info = alloc_domain_page( NULL ); // XXX account properly! sl1pfn = sl1pfn_info - frame_table; - MEM_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn); + SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn); perfc_incrc(shadow_l1_table_count); perfc_incr(shadow_l1_pages); - sl1pfn_info->shadow_and_flags = PSH_shadow | gl1pfn; - frame_table[gl1pfn].shadow_and_flags = PSH_shadowed | sl1pfn; + set_shadow_status(current, gl1pfn, PSH_shadowed | sl1pfn); gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY; spde = (gpde & ~PAGE_MASK) | _PAGE_RW | (sl1pfn<mm.pagetable, "post-sf" ); + check_pagetable( current, current->mm.pagetable, "post-sf" ); + + spin_unlock(¤t->mm.shadow_lock); return 1; // let's try the faulting instruction again... 
@@ -373,13 +482,13 @@ void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr; -MEM_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%08lx\n", +SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%08lx\n", pa,gpte,prev_spfn, prev_spl1e); // to get here, we know the l1 page *must* be shadowed gpfn = pa >> PAGE_SHIFT; - spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + spfn = __shadow_status(current, gpfn) & PSH_pfn_mask; if ( spfn == prev_spfn ) { @@ -417,21 +526,23 @@ void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte ) { unsigned long gpfn, spfn, spte; l2_pgentry_t * sp2le; - unsigned long s_sh; + unsigned long s_sh=0; - MEM_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte); + SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte); // to get here, we know the l2 page has a shadow gpfn = pa >> PAGE_SHIFT; - spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + spfn = __shadow_status(current, gpfn) & PSH_pfn_mask; - sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); - // no real need for a cache here spte = 0; - s_sh = frame_table[gpte >> PAGE_SHIFT].shadow_and_flags; + if( gpte & _PAGE_PRESENT ) + s_sh = __shadow_status(current, gpte >> PAGE_SHIFT); + + sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); + // no real need for a cache here if ( s_sh ) // PSH_shadowed { @@ -463,7 +574,8 @@ char * sh_check_name; #define FAIL(_f, _a...) \ {printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n", sh_check_name, level, i, ## _a , gpte, spte ); BUG();} -int check_pte( unsigned long gpte, unsigned long spte, int level, int i ) +static int check_pte( struct task_struct *p, + unsigned long gpte, unsigned long spte, int level, int i ) { unsigned long mask, gpfn, spfn; @@ -504,42 +616,24 @@ int check_pte( unsigned long gpte, unsigned long spte, int level, int i ) if ( level > 1 ) FAIL("Linear map ???"); // XXX this will fail on BSD -#if 0 // might be a RO mapping of a page table page - if ( frame_table[gpfn].shadow_and_flags != 0 ) - { - FAIL("Should have been shadowed g.sf=%08lx s.sf=%08lx", - frame_table[gpfn].shadow_and_flags, - frame_table[spfn].shadow_and_flags); - } - else -#endif - return 1; + return 1; } else { if ( level < 2 ) FAIL("Shadow in L1 entry?"); - if ( frame_table[gpfn].shadow_and_flags != (PSH_shadowed | spfn) ) - FAIL("spfn problem g.sf=%08lx s.sf=%08lx [g.sf]=%08lx [s.sf]=%08lx", - frame_table[gpfn].shadow_and_flags, - frame_table[spfn].shadow_and_flags, - frame_table[frame_table[gpfn].shadow_and_flags&PSH_pfn_mask].shadow_and_flags, - frame_table[frame_table[spfn].shadow_and_flags&PSH_pfn_mask].shadow_and_flags - ); - - if ( frame_table[spfn].shadow_and_flags != (PSH_shadow | gpfn) ) - FAIL("gpfn problem g.sf=%08lx s.sf=%08lx", - frame_table[gpfn].shadow_and_flags, - frame_table[spfn].shadow_and_flags); - + if ( __shadow_status(p, gpfn) != (PSH_shadowed | spfn) ) + FAIL("spfn problem g.sf=%08lx", + __shadow_status(p, gpfn) ); } return 1; } -int check_l1_table( unsigned long va, unsigned long g2, unsigned long s2 ) +static int check_l1_table( struct task_struct *p, unsigned long va, + unsigned long g2, unsigned long s2 ) { int j; unsigned long *gpl1e, *spl1e; @@ -555,7 +649,7 @@ int check_l1_table( unsigned long va, unsigned long g2, unsigned long s2 ) unsigned long gpte = gpl1e[j]; unsigned long spte = spl1e[j]; - check_pte( gpte, spte, 1, j ); + check_pte( p, 
gpte, spte, 1, j ); } unmap_domain_mem( spl1e ); @@ -567,7 +661,7 @@ int check_l1_table( unsigned long va, unsigned long g2, unsigned long s2 ) #define FAILPT(_f, _a...) \ {printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); BUG();} -int check_pagetable( pagetable_t pt, char *s ) +int check_pagetable( struct task_struct *p, pagetable_t pt, char *s ) { unsigned long gptbase = pagetable_val(pt); unsigned long gpfn, spfn; @@ -576,29 +670,26 @@ int check_pagetable( pagetable_t pt, char *s ) sh_check_name = s; - MEM_VVLOG("%s-PT Audit",s); + SH_VVLOG("%s-PT Audit",s); sh_l2_present = sh_l1_present = 0; gpfn = gptbase >> PAGE_SHIFT; - if ( ! (frame_table[gpfn].shadow_and_flags & PSH_shadowed) ) + if ( ! (__shadow_status(p, gpfn) & PSH_shadowed) ) { printk("%s-PT %08lx not shadowed\n", s, gptbase); - if( frame_table[gpfn].shadow_and_flags != 0 ) BUG(); + if( __shadow_status(p, gpfn) != 0 ) BUG(); return 0; } - spfn = frame_table[gpfn].shadow_and_flags & PSH_pfn_mask; + spfn = __shadow_status(p, gpfn) & PSH_pfn_mask; - if ( ! frame_table[gpfn].shadow_and_flags == (PSH_shadowed | spfn) ) + if ( ! __shadow_status(p, gpfn) == (PSH_shadowed | spfn) ) FAILPT("ptbase shadow inconsistent1"); - if ( ! frame_table[spfn].shadow_and_flags == (PSH_shadow | gpfn) ) - FAILPT("ptbase shadow inconsistent2"); - gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT ); spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT ); @@ -641,7 +732,7 @@ int check_pagetable( pagetable_t pt, char *s ) unsigned long gpte = l2_pgentry_val(gpl2e[i]); unsigned long spte = l2_pgentry_val(spl2e[i]); - check_pte( gpte, spte, 2, i ); + check_pte( p, gpte, spte, 2, i ); } @@ -652,7 +743,7 @@ int check_pagetable( pagetable_t pt, char *s ) unsigned long spte = l2_pgentry_val(spl2e[i]); if ( spte ) - check_l1_table( + check_l1_table( p, i<>PAGE_SHIFT, spte>>PAGE_SHIFT ); @@ -661,7 +752,7 @@ int check_pagetable( pagetable_t pt, char *s ) unmap_domain_mem( spl2e ); unmap_domain_mem( gpl2e ); - MEM_VVLOG("PT verified : l2_present = %d, l1_present = %d\n", + SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n", sh_l2_present, sh_l1_present ); return 1; @@ -671,7 +762,6 @@ int check_pagetable( pagetable_t pt, char *s ) #endif -#endif // CONFIG_SHADOW diff --git a/xen/include/asm-i386/config.h b/xen/include/asm-i386/config.h index 0496f481d9..e5b380618f 100644 --- a/xen/include/asm-i386/config.h +++ b/xen/include/asm-i386/config.h @@ -40,8 +40,6 @@ #define CONFIG_XEN_ATTENTION_KEY 1 -#define CONFIG_SHADOW 1 - #define HZ 100 diff --git a/xen/include/asm-i386/processor.h b/xen/include/asm-i386/processor.h index 9766ac7b20..7cf48541ef 100644 --- a/xen/include/asm-i386/processor.h +++ b/xen/include/asm-i386/processor.h @@ -12,6 +12,7 @@ #include #include #include +#include #include struct task_struct; @@ -416,10 +417,14 @@ struct mm_struct { l1_pgentry_t *perdomain_pt; pagetable_t pagetable; -#ifdef CONFIG_SHADOW - unsigned int shadowmode; /* flags to control shadow table operation */ - pagetable_t shadowtable; -#endif + unsigned int shadow_mode; /* flags to control shadow table operation */ + pagetable_t shadow_table; + spinlock_t shadow_lock; + struct shadow_status *shadow_ht; + struct shadow_status *shadow_ht_free; + struct shadow_status *shadow_ht_extras; // extra allocation units + unsigned int shadow_page_count; + unsigned int shadow_max_page_count; /* Current LDT details. 
*/ unsigned long ldt_base, ldt_ents, shadow_ldt_mapcnt; diff --git a/xen/include/xeno/mm.h b/xen/include/xeno/mm.h index c1df341a28..68eca9807e 100644 --- a/xen/include/xeno/mm.h +++ b/xen/include/xeno/mm.h @@ -67,10 +67,6 @@ struct pfn_info unsigned long type_and_flags; /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ unsigned long tlbflush_timestamp; -#ifdef CONFIG_SHADOW - /* Shadow page status: top bits flags, bottom bits are a pfn */ - unsigned long shadow_and_flags; -#endif }; /* The following page types are MUTUALLY EXCLUSIVE. */ diff --git a/xen/include/xeno/shadow.h b/xen/include/xeno/shadow.h index 7034081b48..212a0dbfbb 100644 --- a/xen/include/xeno/shadow.h +++ b/xen/include/xeno/shadow.h @@ -3,15 +3,13 @@ #ifndef _XENO_SHADOW_H #define _XENO_SHADOW_H -#ifdef CONFIG_SHADOW - #include #include #include +#include /* Shadow PT flag bits in pfn_info */ #define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */ -#define PSH_shadow (1<<30) /* page is a shadow. PFN points to orig page */ #define PSH_pending (1<<29) /* page is in the process of being shadowed */ #define PSH_pfn_mask ((1<<21)-1) @@ -24,28 +22,323 @@ #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) #define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT)))) -extern pagetable_t shadow_mk_pagetable( unsigned long gptbase, unsigned int shadowmode ); -extern void unshadow_table( unsigned long gpfn, unsigned int type ); -extern unsigned long shadow_l2_table( unsigned long gpfn ); +extern pagetable_t shadow_mk_pagetable( struct task_struct *p, + unsigned long gptbase); extern int shadow_fault( unsigned long va, long error_code ); extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, unsigned long *prev_spfn_ptr, l1_pgentry_t **prev_spl1e_ptr ); extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte ); - +extern void unshadow_table( unsigned long gpfn, unsigned int type ); +extern int shadow_mode_enable( struct task_struct *p, unsigned int mode ); #define SHADOW_DEBUG 0 +#define SHADOW_HASH_DEBUG 0 #define SHADOW_OPTIMISE 1 -#endif // end of CONFIG_SHADOW +struct shadow_status { + unsigned long pfn; // gpfn + unsigned long spfn_and_flags; // spfn plus flags + struct shadow_status *next; // use pull-to-front list. +}; + +#define shadow_ht_extra_size 128 /*128*/ +#define shadow_ht_buckets 256 /*256*/ + +#ifndef NDEBUG +#define SH_LOG(_f, _a...) \ + printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + current->domain , __LINE__ , ## _a ) +#else +#define SH_LOG(_f, _a...) +#endif #if SHADOW_DEBUG -extern int check_pagetable( pagetable_t pt, char *s ); +#define SH_VLOG(_f, _a...) \ + printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + current->domain , __LINE__ , ## _a ) +#else +#define SH_VLOG(_f, _a...) +#endif + +#if 0 +#define SH_VVLOG(_f, _a...) \ + printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \ + current->domain , __LINE__ , ## _a ) +#else +#define SH_VVLOG(_f, _a...) 
+#endif + + + +#if SHADOW_HASH_DEBUG +static void shadow_audit(struct task_struct *p, int print) +{ + int live=0, free=0, j=0, abs; + struct shadow_status *a; + + for(j=0;jmm.shadow_ht[j]; + if(a->pfn) live++; + while(a->next && live<9999) + { + live++; + if(a->pfn == 0) + { + printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n", + live, a->pfn, a->spfn_and_flags, a->next); + BUG(); + } + a=a->next; + } + ASSERT(live<9999); + } + + a = p->mm.shadow_ht_free; + while(a) { free++; a=a->next; } + + if(print) printk("live=%d free=%d\n",live,free); + + abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live; + if( abs < -1 || abs > 1 ) + { + printk("live=%d free=%d l1=%d l2=%d\n",live,free, + perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) ); + BUG(); + } + +} + #else -#define check_pagetable( pt, s ) +#define shadow_audit(p, print) +#endif + +static inline struct shadow_status* hash_bucket( struct task_struct *p, + unsigned int gpfn ) +{ + return &(p->mm.shadow_ht[gpfn % shadow_ht_buckets]); +} + + +static inline unsigned long __shadow_status( struct task_struct *p, + unsigned int gpfn ) +{ + struct shadow_status **ob, *b, *B = hash_bucket( p, gpfn ); + + b = B; + ob = NULL; + + SH_VVLOG("lookup gpfn=%08lx bucket=%08lx", gpfn, b ); + shadow_audit(p,0); // if in debug mode + + do + { + if ( b->pfn == gpfn ) + { + unsigned long t; + struct shadow_status *x; + + // swap with head + t=B->pfn; B->pfn=b->pfn; b->pfn=t; + t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; + b->spfn_and_flags=t; + + if(ob) + { // pull to front + *ob=b->next; + x=B->next; + B->next=b; + b->next=x; + } + return B->spfn_and_flags; + } +#if SHADOW_HASH_DEBUG + else + { + if(b!=B)ASSERT(b->pfn); + } #endif + ob=&b->next; + b=b->next; + } + while (b); + + return 0; +} + +/* we can make this locking more fine grained e.g. per shadow page if it +ever becomes a problem, but since we need a spin lock on the hash table +anyway its probably not worth being too clever. 
*/ + +static inline unsigned long get_shadow_status( struct task_struct *p, + unsigned int gpfn ) +{ + unsigned long res; + + spin_lock(&p->mm.shadow_lock); + res = __shadow_status( p, gpfn ); + if (!res) spin_unlock(&p->mm.shadow_lock); + return res; +} + + +static inline void put_shadow_status( struct task_struct *p ) +{ + spin_unlock(&p->mm.shadow_lock); +} +static inline void delete_shadow_status( struct task_struct *p, + unsigned int gpfn ) +{ + struct shadow_status *b, *B, **ob; + + B = b = hash_bucket( p, gpfn ); + + SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b ); + shadow_audit(p,0); + ASSERT(gpfn); + + if( b->pfn == gpfn ) + { + if (b->next) + { + struct shadow_status *D=b->next; + b->spfn_and_flags = b->next->spfn_and_flags; + b->pfn = b->next->pfn; + + b->next = b->next->next; + D->next = p->mm.shadow_ht_free; + p->mm.shadow_ht_free = D; + } + else + { + b->pfn = 0; + b->spfn_and_flags = 0; + } + +#if SHADOW_HASH_DEBUG + if( __shadow_status(p,gpfn) ) BUG(); +#endif + return; + } + + ob = &b->next; + b=b->next; + + do + { + if ( b->pfn == gpfn ) + { + b->pfn = 0; + b->spfn_and_flags = 0; + + // b is in the list + *ob=b->next; + b->next = p->mm.shadow_ht_free; + p->mm.shadow_ht_free = b; + +#if SHADOW_HASH_DEBUG + if( __shadow_status(p,gpfn) ) BUG(); +#endif + return; + } + + ob = &b->next; + b=b->next; + } + while (b); + + // if we got here, it wasn't in the list + BUG(); +} + + +static inline void set_shadow_status( struct task_struct *p, + unsigned int gpfn, unsigned long s ) +{ + struct shadow_status *b, *B, *extra, **fptr; + int i; + + B = b = hash_bucket( p, gpfn ); + + ASSERT(gpfn); + ASSERT(s); + SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next ); + shadow_audit(p,0); + + do + { + if ( b->pfn == gpfn ) + { + b->spfn_and_flags = s; + return; + } + + b=b->next; + } + while (b); + + // if we got here, this is an insert rather than update + + ASSERT( s ); // deletes must have succeeded by here + + if ( B->pfn == 0 ) + { + // we can use this head + ASSERT( B->next == 0 ); + B->pfn = gpfn; + B->spfn_and_flags = s; + return; + } + + if( unlikely(p->mm.shadow_ht_free == NULL) ) + { + SH_LOG("allocate more shadow hashtable blocks"); + + // we need to allocate more space + extra = kmalloc( sizeof(void*) + (shadow_ht_extra_size * + sizeof(struct shadow_status)), GFP_KERNEL ); + + if( ! extra ) BUG(); // should be more graceful here.... 
+ + memset( extra, 0, sizeof(void*) + (shadow_ht_extra_size * + sizeof(struct shadow_status)) ); + + // add extras to free list + fptr = &p->mm.shadow_ht_free; + for ( i=0; imm.shadow_ht[shadow_ht_extra_size]) = + p->mm.shadow_ht_extras; + p->mm.shadow_ht_extras = extra; + + } + + // should really put this in B to go right to front + b = p->mm.shadow_ht_free; + p->mm.shadow_ht_free = b->next; + b->spfn_and_flags = s; + b->pfn = gpfn; + b->next = B->next; + B->next = b; + + return; +} + + + +#if SHADOW_DEBUG +extern int check_pagetable( struct task_struct *p, pagetable_t pt, char *s ); +#else +#define check_pagetable( p, pt, s ) +#endif #endif diff --git a/xen/net/dev.c b/xen/net/dev.c index bb25e6a2b9..5d692f51f4 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -494,6 +494,7 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) unsigned short size; unsigned char offset, status = RING_STATUS_OK; struct task_struct *p = vif->domain; + unsigned long spte_pfn; memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN); if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP ) @@ -546,21 +547,18 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif) goto out; } - -#ifdef CONFIG_SHADOW - if ( pte_page->shadow_and_flags & PSH_shadowed ) + if ( p->mm.shadow_mode && + (spte_pfn=get_shadow_status(p, pte_page-frame_table)) ) { - unsigned long spte_pfn = pte_page->shadow_and_flags & PSH_pfn_mask; unsigned long *sptr = map_domain_mem( (spte_pfn<rx_lock); @@ -2114,21 +2112,16 @@ static void get_rx_bufs(net_vif_t *vif) goto rx_unmap_and_continue; } -#ifdef CONFIG_SHADOW - { - if ( frame_table[rx.addr>>PAGE_SHIFT].shadow_and_flags & PSH_shadowed ) - { - unsigned long spfn = - frame_table[rx.addr>>PAGE_SHIFT].shadow_and_flags & PSH_pfn_mask; - unsigned long * sptr = map_domain_mem( (spfn<mm.shadow_mode && + (spfn=get_shadow_status(p, rx.addr>>PAGE_SHIFT)) ) + { + unsigned long * sptr = + map_domain_mem( (spfn<> PAGE_SHIFT; buf_page = &frame_table[buf_pfn]; -- 2.30.2